#install.packages('ggplot2')
library(ggplot2)
#install.packages('gridExtra')
library(gridExtra)
#install.packages('dplyr')
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:gridExtra':
## 
##     combine
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#install.packages('tidyr')
library(tidyr)
#install.packages('reshape2')
library(reshape2)
## Warning: package 'reshape2' was built under R version 3.4.3
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths

Read in the psuedo Facebook data.

#use sep to indicate file is tab seperated
pf <- read.csv('pseudo_facebook.tsv', sep = '\t')
names(pf)
##  [1] "userid"                "age"                  
##  [3] "dob_day"               "dob_year"             
##  [5] "dob_month"             "gender"               
##  [7] "tenure"                "friend_count"         
##  [9] "friendships_initiated" "likes"                
## [11] "likes_received"        "mobile_likes"         
## [13] "mobile_likes_received" "www_likes"            
## [15] "www_likes_received"

Friend count

Let’s look in to the friend count and see if there’s much of a difference in friend count between male and female users. We would like to omit the rows which have ‘NA’ as their gender by subseting the data to only include the rows that don’t have NA as gender.

qplot(x = friend_count, data = subset(pf, !is.na(gender)), binwidth = 10) +
  scale_x_continuous(limits = c(0, 1000),
                     breaks = seq(0, 1000, 50)) +
  facet_wrap(~gender)
## Warning: Removed 2949 rows containing non-finite values (stat_bin).

From this plot it looks like male users have a higher proportion of low friend count than female useers. Let’s look at the statistics of friend count by gender to get a clearer idea of the difference in friend count.

table(pf$gender)
## 
## female   male 
##  40254  58574
by(pf$friend_count, pf$gender, summary)
## pf$gender: female
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      37      96     242     244    4923 
## -------------------------------------------------------- 
## pf$gender: male
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##       0      27      74     165     182    4917

Both the median and mean friend count for female users is higher than for males. The median friend count will be a more robust statistic in this case as extreme outliers may skew the mean friend count.

Visulaising the differnce in friend count for male and female users using box plots. We will adjust the plot to focus on users who have friend counts between 0 and 1000.

ggplot(aes(x = gender, y=friend_count), 
       data = subset(pf, !is.na(gender))) +
  geom_boxplot() +
  coord_cartesian(ylim=c(0,1000))

Proportion of friend count

Comparing male and female friend counts in an overlay with a frequency polygon rather than two histograms side by side. Change the y axis to show prportions instead of counts.

ggplot(aes(x = friend_count, y = ..count../sum(..count..)),
       data = subset(pf, !is.na(gender))) +
  geom_freqpoly(aes(color = gender), binwidth=10) +
  scale_x_continuous(limits = c(0, 1000), breaks = seq(0, 1000, 50)) +
  xlab('Friend Count') +
  ylab('Proportion of users with that friend count')
## Warning: Removed 2949 rows containing non-finite values (stat_bin).
## Warning: Removed 4 rows containing missing values (geom_path).

A higher proportion of male users have a low friend count than women do. Towards the higher end of friend count (300+) male and female users seem to be close.

Let’s look in to some of the factors that might affect friend count.

Tenure

Friend count will be affected by how long a user has been a member of Facebook so let’s plot the tenure for Facebook users.

qplot(x=tenure/365, data = pf, binwidth=0.25, 
      xlab ='Number of years using Facebook',
      ylab='Number of users in sample',
      color=I('black'), fill=I('#F79420') )
## Warning: Removed 2 rows containing non-finite values (stat_bin).

The majority of users have been on Facebook for less than 2.5 years. However, those that have been on Facebook longer will likely have gained more friends over time than the newer users.

User Ages

Let’s see what age groups use Facebook.

summary(pf$age) #give the min and max ages help determine scale for x axis
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
qplot(x=age, data=pf, binwidth=1, xlab = "age", 
      ylab='Number of users in sample',
      color=I('black'), fill=I('#5760AB')) +
      scale_x_continuous(breaks = seq(13,113,5)) #seq(0,113,5) goes from 0 to 113 in units of 5 on x axis

The summary shows the minimum age is 13 and the max is 113. This is useful as you can’t use facebook if you’re younger than 13 and we’re not confident about the data we have for the over 90s are truthful.

Now let’s see if friend count varies by the age of users on Facebook.

ggplot(aes(x=age, y=friend_count), data =pf) + geom_jitter(alpha=1/20) + 
  xlim(13, 90)
## Warning: Removed 5175 rows containing missing values (geom_point).

summary(pf$age)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   13.00   20.00   28.00   37.28   50.00  113.00
  1. These users are likely to be teenagers or fake accounts given the huge friend counts.

Transform the plot using square root:

ggplot(aes(x=age, y=friend_count), data =pf) + 
  geom_point(alpha=1/20, position = position_jitter(h=0)) + 
  xlim(13, 90) +
  coord_trans(y = 'sqrt') #transform y axis to square root values
## Warning: Removed 5192 rows containing missing values (geom_point).

With this plot it’s much easier to see the distribution of friend count conditional on age. For example we can see threshold count above which there are very few users.

We can split up the datframe and apply a function (to find the mean and median) to some parts of the data. It can be helpful to not just plot all points but to plot against the mean or median friend count for each age as a reference.

#group dataframe by age and store it in age_groups
age_groups <- group_by(pf, age)

# summarise this new grouping of data and create new variables of mean
# friend count, median friend count & the number of people in each group
# Save all of this in a new variable pf.fc_by_age
pf.fc_by_age <- summarise(age_groups, 
          friend_count_mean = mean(friend_count),
          friend_count_median = median(friend_count),
          n = n()) #number of users in each group. 
# n fxn can only be used in summarise and it reports how many people are in each group

#Order data by age
pf.fc_by_age <- arrange(pf.fc_by_age, age)

#print first few rows of dataframe
head(pf.fc_by_age)
## # A tibble: 6 x 4
##     age friend_count_mean friend_count_median     n
##   <int>             <dbl>               <dbl> <int>
## 1    13          164.7500                74.0   484
## 2    14          251.3901               132.0  1925
## 3    15          347.6921               161.0  2618
## 4    16          351.9371               171.5  3086
## 5    17          350.3006               156.0  3283
## 6    18          331.1663               162.0  5196

Overlay plots of the mean and the 10th, 90th and 50th percent quartile in order to give some more context to the graph.

library(ggplot2)
ggplot(aes(x=age, y= friendships_initiated), data = pf) +
  geom_point(alpha = 1/10, position = position_jitter(h=0), color='orange') +
  coord_cartesian(xlim = c(13, 90), ylim = c(0, 1000)) +
  geom_line(stat = 'summary', fun.y = mean) +
  geom_line(stat= 'summary', fun.y = quantile, fun.args = list(probs = .1), linetype =2, color='blue') +
  geom_line(stat= 'summary', fun.y = quantile, fun.args = list(probs = .9), linetype =2, color='blue') +
  geom_line(stat= 'summary', fun.y = quantile, fun.args = list(probs = .5), color='blue')

# fun.args = list(probs = .9) gives the 90th percent quantile, linetype=2 makes it dashed 

Correlation between age and friend count

If the correlation coefficient is close to 1, it would indicate that the variables are positively linearly related and the scatter plot falls almost along a straight line with positive slope. A rule of thumb is that a correlation greater than 0.3 or less than -0.3 is meaningful, but small. around 0.5 is moderatre and 0.7 is large.

#the default method for computing the correlation coefficient is Pearson. 
cor.test(pf$age, pf$friend_count)
## 
##  Pearson's product-moment correlation
## 
## data:  pf$age and pf$friend_count
## t = -8.6268, df = 99001, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.03363072 -0.02118189
## sample estimates:
##         cor 
## -0.02740737

The correlation between age and friend count: -0.027 . This implies that there isn’t a meaningful relationship between age and friend count.

Correlation on Subsets

Based on the correlation co efficient and the above plot we just observed that the relationship between age and friend count is not linear. It isn’t monotonic, either increasing or decreasing. Furthermore, based on the plot we know that we maybe don’t want to include the older ages in our correlation number since older ages are likely to be incorrect. Lets redo the correlation calculation for users aged 70 or less:

with(subset(pf, age<=70), cor.test(age, friend_count))
## 
##  Pearson's product-moment correlation
## 
## data:  age and friend_count
## t = -52.592, df = 91029, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.1780220 -0.1654129
## sample estimates:
##        cor 
## -0.1717245

This tells a different story about a negative relationship between age and friend count. As age increases friend count decreases. The pearson product measures the strength of a relationship between any two variables. But there can be lots of other types of relationships.


Comparing age, gender and friend count

Boxplot of ages by gender. Here we can see the averages by a shape marked ‘x’. Since male users are a bit younger we might think a simple male to female comparison doesn’t capture the substantial differences in friend count.

ggplot(aes(x=gender, y=age), data= subset(pf, !is.na(gender))) + geom_boxplot() +
  stat_summary(fun.y=mean, geom = 'point', shape=4)

Lets make a graph of median friend count:

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(gender))) + 
  geom_line( aes(color=gender), stat='summary', fun.y=median)

We can see that nearly everywhere the median friend count is larger for women than it is for men. There are some exceptions which include the noise around the old users. We’re not really confident about these reported older ages. Users reporting age 70 seem to more or less have the same friend count regardless of gender.

We can create a data frame that contains information on each age AND gender group along with the mean and median friend counts.

new_groupings <- group_by(pf, age, gender)
pf.fc_by_age_gender <- summarise(new_groupings,
                                 mean_friend_count = mean(friend_count),
                                 median_friend_count = median(friend_count),
                                 n=n())
head(pf.fc_by_age_gender)
## # A tibble: 6 x 5
## # Groups:   age [3]
##     age gender mean_friend_count median_friend_count     n
##   <int> <fctr>             <dbl>               <dbl> <int>
## 1    13 female          259.1606               148.0   193
## 2    13   male          102.1340                55.0   291
## 3    14 female          362.4286               224.0   847
## 4    14   male          164.1456                92.5  1078
## 5    15 female          538.6813               276.0  1139
## 6    15   male          200.6658               106.5  1478
ggplot(aes(x=age, y=median_friend_count), data = subset(pf.fc_by_age_gender, !is.na(gender))) +
  geom_line(aes(color=gender))

It seems like the largest difference between friend count for male and female occurs between young users.


Thinking in Ratios

Let’s answer the question how many more times friends does the average female user have than the male user. To answer that question we need to reshape our data. Right now the data is in long format. We have many rows and the variables that we grouped over, male and female, have been repeated for each year. Let’s convert from long format to wide format. This new dataframe will have one row for each age and then we’ll put the median friend count inside of rows for males and females.

pf.fc_by_age_gender.wide2 <-
  subset(pf.fc_by_age_gender[c('age', 'gender', 'median_friend_count')],
         !is.na(gender)) %>%
  spread(gender, median_friend_count) %>%
  mutate(ratio = male / female)

head(pf.fc_by_age_gender.wide2)
## # A tibble: 6 x 4
## # Groups:   age [6]
##     age female  male     ratio
##   <int>  <dbl> <dbl>     <dbl>
## 1    13  148.0  55.0 0.3716216
## 2    14  224.0  92.5 0.4129464
## 3    15  276.0 106.5 0.3858696
## 4    16  258.5 136.0 0.5261122
## 5    17  245.5 125.0 0.5091650
## 6    18  243.0 122.0 0.5020576

Plot the ratio of the female to male median friend counts Add a horizontal line to the plot with a y intercept of 1, which will be the base line.

ggplot(aes(x=age, y=ratio), data=pf.fc_by_age_gender.wide2) +
  geom_line() +
  geom_hline(yintercept = 1, alpha=0.3, linetype=2)

Putting it all together - Tenure & Age

For friend count we need to take in to account how long people have been on Facebook as someone who’s been using it for a long time will likely have gained more friends. This data was collected in 2014 so we’re going to take that in to account. Create a variable called year_joined in the data frame using the variable tenure and 2014 as the reference year. Tenure variable = how many days since registering with Facebook.

pf$year_joined <- floor(2014-pf$tenure/365) 
# the floor() function to round down to the nearest integer

Let’s look at a summary of the data on year joined

summary(pf$year_joined)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    2005    2012    2012    2012    2013    2014       2
#table format of year joined data
table(pf$year_joined)
## 
##  2005  2006  2007  2008  2009  2010  2011  2012  2013  2014 
##     9    15   581  1507  4557  5448  9860 33366 43588    70

There’s not much info on the earlier years - 2005 & 2006. To increase the info we have in each tenure category we can group some of this data together. We’re going to create 4 bins for the years. The bins will be (i) 2004-2009 (ii) 2009 - 2011 (iii) 2011 - 2012 (iv) 2012 - 2014

pf$year_joined_bucket <- cut(pf$year_joined, c(2004, 2009, 2011, 2012, 2014))
table(pf$year_joined_bucket)
## 
## (2004,2009] (2009,2011] (2011,2012] (2012,2014] 
##        6669       15308       33366       43658

We can graph friend_count vs. age so that each year_joined.bucket is a line tracking the median user friend_count across age. To put these cohorts in to prespective we can plot the grand mean on here as well. The grand mean is the overall mean of friend count vs age.

ggplot(aes(x = age, y = friend_count),
       data = subset(pf, !is.na(year_joined_bucket))) +
  geom_line(aes(color = year_joined_bucket), stat = 'summary', fun.y = mean) +
  geom_line(stat = 'summary', fun.y = mean, linetype=2)

We can see that users with longer tenure tend to have higher friend counts with the exception of older users of about 80+. We might increase our confidence that this observation isn’t just an artefact of the time users have had to accumulate friends.

Friending rate

We could also look at tenure and friend count as a rate instead. For example we could see how many friends does a user have for each day since using the service. Subset the data so you only consider users that have at least one day of tenure

with(subset(pf, tenure>=1), summary(friend_count/tenure))
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
##   0.0000   0.0775   0.2205   0.6096   0.5658 417.0000

Do new users go on a friending spree? Or do users with greater tenure friend more people? Plot friendships initiated per day vs. tenure.

ggplot(aes(x = tenure, y = friendships_initiated/tenure),data = subset(pf, tenure>=1)) +
  geom_line(aes(color = year_joined_bucket), stat = "summary", fun.y = mean) 

It looks like users with more tenure typically initiate less friendships.

There’s a lot of noise in our graph since we’re plotting the mean of y for every possible tenure x value. We can reduce this noise by increasing the bin width on the x axis.

p1 <- ggplot(aes(x = tenure, y = friendships_initiated / tenure),
       data = subset(pf, tenure >= 1)) +
  geom_line(aes(color = year_joined_bucket),
            stat = 'summary',
            fun.y = mean)

p2 <- ggplot(aes(x = 7 * round(tenure / 7), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary",
            fun.y = mean)

p3 <- ggplot(aes(x = 30 * round(tenure / 30), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary",
            fun.y = mean)

p4 <- ggplot(aes(x = 90 * round(tenure / 90), y = friendships_initiated / tenure),
       data = subset(pf, tenure > 0)) +
  geom_line(aes(color = year_joined_bucket),
            stat = "summary",
            fun.y = mean)
grid.arrange(p1, p2, p3, p4, ncol=1)


Summary